This analysis examines simulated income data for Hungary, focusing on the relationships between income and various demographic factors including age, location, occupation, and gender. The dataset is simulated to reflect realistic patterns while maintaining a manageable size for analysis.
set.seed(123) # For reproducibility
# Define parameters
n <- 10000 # Number of observations
cities <- c("Budapest", "Debrecen", "Szeged", "Miskolc", "Pécs", "Győr", "Szombathely", "Eger")
occupations <- c("Software Developer", "Teacher", "Doctor", "Sales Representative",
"Engineer", "Accountant", "Nurse", "Manager", "Chef", "Driver")
genders <- c("Male", "Female")
# Generate data
data <- data.frame(
age = round(rnorm(n, mean = 40, sd = 12)),
city = sample(cities, n, replace = TRUE),
occupation = sample(occupations, n, replace = TRUE),
gender = sample(genders, n, replace = TRUE)
)
# Ensure age is between 18 and 70
data$age <- pmax(18, pmin(70, data$age))
# Generate income based on relationships
data <- data %>%
mutate(
# Base income
base_income = 300000,
# Age effect (quadratic relationship)
age_effect = 10000 * (age - 40) - 100 * (age - 40)^2,
# City effect
city_effect = case_when(
city == "Budapest" ~ 100000,
city %in% c("Debrecen", "Szeged") ~ 50000,
TRUE ~ 0
),
# Occupation effect
occupation_effect = case_when(
occupation == "Software Developer" ~ 150000,
occupation == "Doctor" ~ 120000,
occupation == "Manager" ~ 100000,
occupation == "Engineer" ~ 80000,
occupation == "Accountant" ~ 60000,
occupation == "Teacher" ~ 40000,
occupation == "Nurse" ~ 30000,
occupation == "Chef" ~ 20000,
occupation == "Driver" ~ 10000,
TRUE ~ 0
),
# Gender effect (simulating gender pay gap)
gender_effect = ifelse(gender == "Male", 20000, 0),
# Random noise
noise = rnorm(n, 0, 50000)
) %>%
mutate(
income = base_income + age_effect + city_effect + occupation_effect + gender_effect + noise
) %>%
select(age, city, occupation, gender, income)
# Save the data
write.csv(data, "hungarian_income_data.csv", row.names = FALSE)# Summary statistics with kable
summary_stats <- summary(data)
kable(summary_stats, caption = "Summary Statistics of the Dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)| age | city | occupation | gender | income | |
|---|---|---|---|---|---|
| Min. :18.0 | Length:10000 | Length:10000 | Length:10000 | Min. :-112471 | |
| 1st Qu.:32.0 | Class :character | Class :character | Class :character | 1st Qu.: 289781 | |
| Median :40.0 | Mode :character | Mode :character | Mode :character | Median : 389012 | |
| Mean :40.1 | NA | NA | NA | Mean : 382371 | |
| 3rd Qu.:48.0 | NA | NA | NA | 3rd Qu.: 479762 | |
| Max. :70.0 | NA | NA | NA | Max. : 843293 |
# Interactive income distribution by gender
p1 <- ggplot(data, aes(x = income, fill = gender)) +
geom_density(alpha = 0.6) +
scale_fill_viridis_d() +
labs(title = "Income Distribution by Gender",
subtitle = "Density plot showing the distribution of income across genders",
x = "Income (HUF)",
y = "Density") +
custom_theme
ggplotly(p1)# Interactive income by occupation with jitter
p2 <- ggplot(data, aes(x = reorder(occupation, income, FUN = median), y = income, color = occupation)) +
geom_boxplot(alpha = 0.7) +
geom_jitter(alpha = 0.1, width = 0.2) +
scale_color_viridis_d() +
coord_flip() +
labs(title = "Income Distribution by Occupation",
subtitle = "Boxplot with individual data points",
x = "Occupation",
y = "Income (HUF)") +
custom_theme
ggplotly(p2)# Interactive income by city with violin plot
p3 <- ggplot(data, aes(x = reorder(city, income, FUN = median), y = income, fill = city)) +
geom_violin(alpha = 0.7) +
geom_boxplot(width = 0.2, alpha = 0.5) +
scale_fill_viridis_d() +
coord_flip() +
labs(title = "Income Distribution by City",
subtitle = "Violin plot with boxplot overlay",
x = "City",
y = "Income (HUF)") +
custom_theme
ggplotly(p3)# Interactive age vs income scatter plot with regression line
p4 <- ggplot(data, aes(x = age, y = income, color = gender)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "loess", se = TRUE) +
scale_color_viridis_d() +
labs(title = "Relationship between Age and Income",
subtitle = "Scatter plot with LOESS regression line",
x = "Age",
y = "Income (HUF)") +
custom_theme
ggplotly(p4)# Create a correlation heatmap
numeric_data <- data %>% select(age, income)
cor_matrix <- cor(numeric_data)
cor_df <- as.data.frame(cor_matrix)
cor_df$var1 <- rownames(cor_df)
cor_df_long <- pivot_longer(cor_df, -var1, names_to = "var2", values_to = "correlation")
ggplot(cor_df_long, aes(x = var1, y = var2, fill = correlation)) +
geom_tile() +
scale_fill_viridis() +
labs(title = "Correlation Heatmap",
subtitle = "Correlation between numeric variables",
x = "",
y = "") +
custom_theme +
theme(axis.text.x = element_text(angle = 45, hjust = 1))# Test if there's a significant difference in income between genders
t_test_result <- t.test(income ~ gender, data = data)
print(t_test_result)##
## Welch Two Sample t-test
##
## data: income by gender
## t = -5.9705, df = 9996.2, p-value = 2.446e-09
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -21941.72 -11095.24
## sample estimates:
## mean in group Female mean in group Male
## 374135.3 390653.8
# Test if there are significant differences in income between cities
city_anova <- aov(income ~ city, data = data)
print(summary(city_anova))## Df Sum Sq Mean Sq F value Pr(>F)
## city 7 1.296e+13 1.851e+12 103.3 <2e-16 ***
## Residuals 9992 1.790e+14 1.792e+10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test if there are significant differences in income between occupations
occupation_anova <- aov(income ~ occupation, data = data)
print(summary(occupation_anova))## Df Sum Sq Mean Sq F value Pr(>F)
## occupation 9 2.342e+13 2.603e+12 154.2 <2e-16 ***
## Residuals 9990 1.686e+14 1.687e+10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test if income distributions differ between genders
male_income <- data$income[data$gender == "Male"]
female_income <- data$income[data$gender == "Female"]
ks_test <- ks.test(male_income, female_income)
print(ks_test)##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: male_income and female_income
## D = 0.048592, p-value = 1.492e-05
## alternative hypothesis: two-sided
# Test if income distributions differ between cities
kruskal_test <- kruskal.test(income ~ city, data = data)
print(kruskal_test)##
## Kruskal-Wallis rank sum test
##
## data: income by city
## Kruskal-Wallis chi-squared = 638.84, df = 7, p-value < 2.2e-16
# Convert categorical variables to factors
data$city <- as.factor(data$city)
data$occupation <- as.factor(data$occupation)
data$gender <- as.factor(data$gender)
# Fit the model
model1 <- lm(income ~ age + I(age^2) + city + occupation + gender, data = data)
# Create a beautiful model summary table
model_summary <- summary(model1)
kable(tidy(model_summary), caption = "Multiple Linear Regression Results") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | -96092.80004 | 5346.84928 | -17.971855 | 0 |
| age | 17807.38096 | 247.50834 | 71.946590 | 0 |
| I(age^2) | -98.24611 | 2.98482 | -32.915255 | 0 |
| cityDebrecen | -48896.22562 | 2037.36766 | -23.999706 | 0 |
| cityEger | -100281.00819 | 2029.78290 | -49.404795 | 0 |
| cityGyőr | -100731.39605 | 2020.96342 | -49.843255 | 0 |
| cityMiskolc | -100072.32151 | 2013.43652 | -49.702248 | 0 |
| cityPécs | -99161.76998 | 2015.48565 | -49.199939 | 0 |
| citySzeged | -51320.31355 | 2009.39904 | -25.540131 | 0 |
| citySzombathely | -100440.01966 | 2014.50213 | -49.858483 | 0 |
| occupationChef | -38632.37074 | 2263.24122 | -17.069489 | 0 |
| occupationDoctor | 61046.06725 | 2267.92997 | 26.917086 | 0 |
| occupationDriver | -48852.44645 | 2239.32545 | -21.815697 | 0 |
| occupationEngineer | 21662.39493 | 2249.52488 | 9.629765 | 0 |
| occupationManager | 43139.45933 | 2253.96387 | 19.139375 | 0 |
| occupationNurse | -28367.40970 | 2252.92890 | -12.591347 | 0 |
| occupationSales Representative | -62327.70195 | 2247.52268 | -27.731734 | 0 |
| occupationSoftware Developer | 91320.22019 | 2267.71522 | 40.269704 | 0 |
| occupationTeacher | -16321.55449 | 2226.30713 | -7.331223 | 0 |
| genderMale | 19165.10817 | 997.20292 | 19.218865 | 0 |
# Enhanced model diagnostics
par(mfrow = c(2,2))
plot(model1, col = income_palette[1], pch = 19, cex = 0.7)# Fit polynomial regression
model2 <- lm(income ~ poly(age, 3), data = data)
# Create an interactive plot
p5 <- ggplot(data, aes(x = age, y = income)) +
geom_point(alpha = 0.1, color = income_palette[1]) +
geom_smooth(method = "lm", formula = y ~ poly(x, 3),
color = income_palette[5], fill = income_palette[5], alpha = 0.2) +
labs(title = "Polynomial Regression: Age vs Income",
subtitle = "Cubic polynomial fit with confidence interval",
x = "Age",
y = "Income (HUF)") +
custom_theme
ggplotly(p5)# Model comparison
model_comparison <- data.frame(
Model = c("Multiple Linear", "Polynomial"),
R_squared = c(summary(model1)$r.squared, summary(model2)$r.squared),
Adj_R_squared = c(summary(model1)$adj.r.squared, summary(model2)$adj.r.squared)
)
kable(model_comparison, caption = "Model Comparison") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)| Model | R_squared | Adj_R_squared |
|---|---|---|
| Multiple Linear | 0.8710142 | 0.8707686 |
| Polynomial | 0.6818015 | 0.6817060 |
# Create a sample prediction
new_data <- data.frame(
age = 35,
city = "Budapest",
occupation = "Software Developer",
gender = "Male"
)
# Predict income
prediction <- predict(model1, newdata = new_data, interval = "prediction")
print(prediction)## fit lwr upr
## 1 517299.4 419559.1 615039.7
The analysis of the simulated Hungarian income data reveals several interesting patterns:
The analysis demonstrates the complex interplay between various factors affecting income levels in Hungary. The findings suggest that both demographic factors (age, gender) and professional factors (occupation, location) play important roles in determining income levels.